package com.diven.spark.ml.learn.feature

import org.apache.spark.sql.{DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.Binarizer
import org.apache.spark.sql.types.DataTypes
import com.diven.spark.ml.learn.core.BaseTest
import com.diven.spark.ml.learn.core.BaseSpark

/**
 * Companion object of [[BinarizerTest]]; extends `BaseTest` and provides the
 * factory used to obtain the job instance.
 */
object BinarizerTest extends BaseTest {

    /**
     * Builds a fresh [[BinarizerTest]] and hands it back typed as `BaseSpark`.
     *
     * @return a newly constructed `BinarizerTest`
     */
    def apply(): BaseSpark = {
        val job: BaseSpark = new BinarizerTest()
        job
    }

}

/**
 * Example job that applies Spark ML's `Binarizer` to the "weight" column of
 * the supplied data and prints the outcome.
 */
class BinarizerTest extends BaseSpark {

    /**
     * Selects the "weight" column cast to double, binarizes it with a
     * threshold of 60 into a new "weight_binarizer" column, and prints the
     * resulting rows with `show()`.
     *
     * Per the Spark ML documentation, `Binarizer` maps values strictly greater
     * than the threshold to 1.0 and all other values to 0.0.
     *
     * @param dataFrame input data; expected to contain a "weight" column
     */
    override def execute(dataFrame: DataFrame): Unit = {
        // Column names: the source feature and the binarized output.
        val feature = "weight"
        val featureNew = "weight_binarizer"

        // Preprocessing: keep only the feature column, cast to double before
        // handing it to the Binarizer.
        val dataset = dataFrame.select(col(feature).cast(DataTypes.DoubleType))

        // Feature binarization.
        val binarized = new Binarizer()
            .setInputCol(feature)        // feature to transform
            .setOutputCol(featureNew)    // name of the transformed feature
            .setThreshold(60d)           // threshold
            .transform(dataset)

        // Print the transformed rows.
        binarized.show()
    }

}